# Replication code for "Following the trail of words: 
# mapping trending topics of graduate research in IR 
# in Brazil (1987-2018)"

############ In this code, we replicate the analysis itself, after
############ having the preprocessed data.

## Loading packages


if(require(readxl) == F) {install.packages("readxl");
  require(readxl)}
if(require(tidyr) == F) {install.packages("tidyr");
  require(tidyr)}
if(require(tidytext) == F) {install.packages("tidytext");
  require(tidytext)}
if(require(stm) == F) {install.packages("stm");
  require(stm)}
if(require(readtext) == F) {install.packages("readtext");
  require(readtext)}
if(require(quanteda) == F) {install.packages("quanteda");
  require(quanteda)}
if(require(dplyr) == F) {install.packages("dplyr");
  require(dplyr)}
if(require(ggplot2) == F) {install.packages("ggplot2");
  require(ggplot2)}
if(require(haven) == F) {install.packages("haven");
  require(haven)}
if(require(here) == F) {install.packages("here");
  require(here)}
if(require(lexiconPT) == F) {install.packages("lexiconPT");
  require(lexiconPT)}
if(require(reshape2) == F) {install.packages("reshape2");
  require(reshape2)}
if(require(ptstem) == F) {install.packages("ptstem");
  require(ptstem)}
if(require(stringr) == F) {install.packages("stringr");
  require(stringr)}
if(require(tm) == F) {install.packages("tm");
  require(tm)}
if(require(reshape2) == F) {install.packages("reshape2");
  require(reshape2)}
if(require(gridExtra) == F) {install.packages("gridExtra");
  require(gridExtra)}
if(require(gtable) == F) {install.packages("gtable");
  require(gtable)}
if(require(stargazer) == F) {install.packages("stargazer");
  require(stargazer)}
if(require(xlsx) == F) {install.packages("xlsx");
  require(xlsx)}
if(require(ggpubr) == F) {install.packages("ggpubr");
  require(ggpubr)}
if(require(extrafont) == F) {install.packages("extrafont");
  require(extrafont)}
loadfonts(device = "all")

## Loading workspace
## The path is relative to the current working directory. The rest of the
## script uses the objects `df` and `out` without creating them, so the
## workspace file must provide both.
## NOTE(review): presumably `df` holds the preprocessed thesis/dissertation
## records and `out` the prepared STM input (documents, vocab, meta) -
## confirm against the preprocessing script.

load("replication_workspace.RData")

## Figure 1 - Number of IR graduate programs in Brazil (1998-2018)

## Count programme records per area (prog2) and year (ano), relabel two
## areas, fill years without records with 0 and drop Political Science.
cursos <- read.xlsx("cursos.pos.consolidado.xlsx", 1) %>%
  mutate(num = 1) %>%
  aggregate(num ~ prog2 + ano, data = ., FUN = sum) %>%
  mutate(
    prog2 = recode(
      prog2,
      "ESTUDOS ESTRATÉGICOS DE DEFESA E DA SEGURANÇA" =
        "Defense and Strategic Studies",
      "RELAÇÕES INTERNACIONAIS" = "Relações Internacionais"
    )
  ) %>%
  complete(prog2, ano = 1998:2018, fill = list(num = 0)) %>%
  as.data.frame() %>%
  filter(prog2 != "CIÊNCIA POLÍTICA")

## One line per area; the legend sits below the panel.
ggplot(cursos, aes(x = ano, y = num, linetype = prog2, shape = prog2)) +
  geom_line() +
  labs(x = "Year", y = "Total of Graduate Programs",
       linetype = "Area", shape = "Area") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    text = element_text(size = 10, family = "Cambria"),
    legend.margin = margin(0, 0, 0, 0),
    legend.box.margin = margin(0, 0, 0, 0)
  )

## ggsave() writes the most recently created plot.
ggsave(filename = "Figure1.png", height = 4, width = 6, units = "in")

rm(cursos)

## Figure 2 - The number of concluded theses and dissertations, from 1987 to 2018

## Every row of `df` counts as one document; sum the counts per base year
## (AnoBase) and draw them as a single line.
df %>%
  mutate(num = 1) %>%
  aggregate(num ~ AnoBase, data = ., FUN = sum, na.rm = TRUE) %>%
  ggplot(aes(x = AnoBase, y = num)) +
  geom_line() +
  labs(x = "Year", y = "Concluded theses and dissertations") +
  theme_linedraw() +
  theme(
    legend.position = "bottom",
    text = element_text(size = 10, family = "Cambria"),
    legend.margin = margin(0, 0, 0, 0),
    legend.box.margin = margin(0, 0, 0, 0)
  )

ggsave(filename = "Figure2.png", height = 4, width = 6, units = "in")

## We will now run STM. We begin by identifying the ideal number of topics.
## The candidate values of K are evaluated in five separate searchK() calls
## because the search consumes a great volume of memory and can eventually
## fail; keeping the calls separate preserves the results already obtained.
## We first set the seed, as results may differ when repeating the same
## operation.

set.seed(3754890)

## Runs searchK() for one batch of candidate K values, always with the same
## prevalence formula, initialisation and held-out seed.
run_search_k <- function(k_grid) {
  searchK(out$documents, out$vocab,
          K = k_grid,
          data = out$meta,
          prevalence = ~ factor(AnoBase) + factor(SiglaIes) +
            factor(NomePrograma),
          init.type = "Spectral",
          verbose = TRUE,
          heldout.seed = 3754890)
}

findingk  <- run_search_k(c(45, 50, 55, 60))
findingk2 <- run_search_k(c(53, 56, 68, 70))
findingk3 <- run_search_k(c(52, 67, 62, 65))
findingk4 <- run_search_k(c(48, 54, 58, 64))
findingk5 <- run_search_k(c(46, 47, 49, 51))

## Stack the per-batch result tables into a single data frame and make sure
## the two statistics plotted in Figure 3 are numeric.
k_searches <- list(findingk, findingk2, findingk3, findingk4, findingk5)
k2 <- do.call(
  rbind,
  lapply(k_searches, function(search) as.data.frame(search$results))
) %>%
  mutate(exclus = as.numeric(exclus), semcoh = as.numeric(semcoh))

## Figure 3 - Exclusivity and Semantic Coherence,
## based on the number of topics

## Each candidate model is drawn as its K value, placed at the model's
## semantic coherence (x) and exclusivity (y).
ggplot(k2, aes(x = semcoh, y = exclus)) +
  geom_text(aes(label = K)) +
  labs(x = "Semantic Coherence", y = "Exclusivity") +
  theme_linedraw() +
  theme(
    text = element_text(size = 10, family = "Cambria"),
    legend.margin = margin(0, 0, 0, 0),
    legend.box.margin = margin(0, 0, 0, 0)
  )

ggsave(filename = "Figure3.png", height = 4.5, width = 6, units = "in")

## Then, as results pointed to the model with 49 topics as outperforming
## the others, we run it. Topic labels are available in the Supplementary
## Material.

## Seed the RNG immediately before fitting so that the run is reproducible.
set.seed(3754890)

## Fit the structural topic model with K = 49 topics, using base year,
## institution and programme as prevalence covariates.
## `set.seed()` must not be passed inside the call: an extra unnamed argument
## is matched positionally to stm()'s `content` parameter (where it evaluates
## to NULL) rather than to `seed`. The RNG state at the call is the one set
## just above.
## NOTE(review): the fitted model is expected to be identical to the one
## obtained with the stray argument in place - confirm against the published
## topic numbering before relying on the labels assigned further down.
topic2 <- stm(documents = out$documents, vocab = out$vocab,
              K = 49, data = out$meta,
              prevalence = ~ factor(AnoBase) + factor(SiglaIes) +
                factor(NomePrograma),
              init.type = "Spectral")

summary(topic2)

## We export STM results to a database: one row per document with its topic
## proportions (Topic1, Topic2, ...) followed by the metadata columns.

df2 <- make.dt(topic2, meta = out$meta)

## And finally we plot the topics.

## Human-readable label for every topic kept in the analysis. Topics 1, 2
## and 6 are dropped below and therefore carry no label.
topic_labels <- c(
  "Topic3" = "Industry, Technology, and Innovation",
  "Topic4" = "Foreign Policy Decision-Making",
  "Topic5" = "Agricultural Negotiations",
  "Topic7" = "Southern Cone States",
  "Topic8" = "Bi and Multilateral Relations",
  "Topic9" = "Refugees",
  "Topic10" = "Strategic Studies",
  "Topic11" = "Environment and Climate Change",
  "Topic12" = "Health and Education Policies",
  "Topic13" = "International Cooperation",
  "Topic14" = "European Studies",
  "Topic15" = "History",
  "Topic16" = "Amazon and Borders",
  "Topic17" = "International Economy",
  "Topic18" = "Multilateral Trade Regimes",
  "Topic19" = "Terrorism",
  "Topic20" = "Gender Studies",
  "Topic21" = "Portuguese Africa",
  "Topic22" = "Inter-American HR System",
  "Topic23" = "National Defense",
  "Topic24" = "Migration Flows",
  "Topic25" = "Peace Operations",
  "Topic26" = "UN Security Council",
  "Topic27" = "Traditional Communities",
  "Topic28" = "Energy Resources",
  "Topic29" = "US Foreign Policy",
  "Topic30" = "South American Integration",
  "Topic31" = "Humanitarian Intervention",
  "Topic32" = "Nuclear Regime",
  "Topic33" = "Intl. Security and Drug Trafficking",
  "Topic34" = "Diplomacy",
  "Topic35" = "International Economic Crises",
  "Topic36" = "Post-Positivism",
  "Topic37" = "Subnational Actors",
  "Topic38" = "Southern Cone Integration",
  "Topic39" = "Asian Powers",
  "Topic40" = "Foreign Investment",
  "Topic41" = "Natl. Perception and Public Opinion",
  "Topic42" = "Political Regime",
  "Topic43" = "International Law",
  "Topic44" = "International Trade",
  "Topic45" = "Non-state Actors",
  "Topic46" = "Geopolitics in the Atlantic",
  "Topic47" = "Brazilian Foreign Policy",
  "Topic48" = "Latin American Integration",
  "Topic49" = "MINUSTAH"
)

## Keep only the base year and the retained topic proportions, reshape to
## long format (one row per document and topic) and apply the labels.
prev <- df2 %>%
  select(-c(docnum, Regiao, Uf, SiglaIes, NomePrograma, Autor, TituloTese,
            Nivel, DataDefesa, PalavrasChave, Idioma, ResumoTese,
            LinhaPesquisa, orientador, Topic1, Topic2, Topic6)) %>%
  melt(id.vars = "AnoBase") %>%
  mutate(variable = recode(variable, !!!topic_labels))
  
## Figure 4 - The most prevalent topics in Brazilian IR theses and dissertations (1987-2018)

## Mean prevalence of every topic over all documents.
topprev <- aggregate(value ~ variable, data = prev, FUN = mean)

## Horizontal bars, ordered by mean prevalence, for the topics above the
## 0.023286 cut-off.
ggplot(filter(topprev, value > 0.023286),
       aes(x = reorder(variable, value), y = value)) +
  geom_col() +
  coord_flip() +
  labs(x = "Topic", y = "Mean Prevalence") +
  theme_bw() +
  theme(
    text = element_text(size = 13, family = "Cambria"),
    legend.margin = margin(0, 0, 0, 0),
    legend.box.margin = margin(0, 0, 0, 0)
  )

ggsave(filename = "Figure4.png", height = 3.5, width = 5, units = "in")

rm(topprev)

## Figure 5 - Topics with the highest average prevalence per year

## Builds the bar chart of the five topics with the highest mean prevalence
## among the documents whose base year (AnoBase) equals `year`. The year is
## also used as the panel title. Ties at the fifth position are all kept,
## as top_n() does.
plot_top5_topics <- function(year) {
  filter(prev, AnoBase == year) %>%
    aggregate(value ~ variable, data = ., FUN = mean) %>%
    top_n(n = 5, wt = value) %>%
    ggplot(aes(x = reorder(variable, value), y = value)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    labs(x = "Topic", y = "Mean Prevalence") +
    theme_bw() + ggtitle(year) +
    theme(text = element_text(size = 13, family = "Cambria"),
          legend.margin = margin(0, 0, 0, 0),
          legend.box.margin = margin(0, 0, 0, 0))
}

## One panel per decade snapshot
b88 <- plot_top5_topics(1988)
b98 <- plot_top5_topics(1998)
b08 <- plot_top5_topics(2008)
b18 <- plot_top5_topics(2018)

## Show the 2 x 2 panel on the active device ...
grid.arrange(b88, b98, b08, b18, nrow = 2)

## ... and write the same panel to a JPEG file. Width and height are given
## in pixels; `quality` is a percentage, so 100 is the highest valid value.
jpeg(filename = "Figure5.jpeg",
     width = 800, height = 500, units = "px", quality = 100)
grid.arrange(b88, b98, b08, b18, nrow = 2)
dev.off()

rm(b88, b98, b08, b18, plot_top5_topics)

## Figure 6 - Prevalence of documents in different topics

## One point per document and topic, keeping only the pairs in which the
## topic is actually present (prevalence above 0.1) and excluding the
## non-identifiable topics. One facet per topic, seven facets per row.
prev %>%
  filter(!variable %in% c("Topic1", "Topic2", "Topic6"),
         value > 0.1) %>%
  ggplot(aes(x = AnoBase, y = value)) +
  geom_point() +
  facet_wrap(~ variable, ncol = 7) +
  labs(x = "Year", y = "Prevalence") +
  theme_bw() +
  theme(
    text = element_text(size = 10, family = "Cambria"),
    legend.margin = margin(0, 0, 0, 0),
    legend.box.margin = margin(0, 0, 0, 0)
  )

ggsave(filename = "Figure6.png", height = 8, width = 14, units = "in")

## Figure 7 - Trends in the average prevalence of topics related to foreign policy

## Standardise prevalence over all documents and topics (mean 0, sd 1).
## This overwrites prev$value, so the figures that use raw prevalence must be
## produced before this line. scale() returns a one-column matrix;
## as.numeric() stores it back as a plain numeric vector.
prev$value <- as.numeric(scale(prev$value))

## Keep only the foreign policy-related topics (the following figures do the
## same for other groups of topics). The names must match the labels assigned
## when `prev` was built: Topic34 is labelled "Diplomacy"; "Cultural
## Diplomacy" is not an existing label and would silently match nothing.
prev %>% filter(variable %in% c("Foreign Policy Decision-Making",
                                "Bi and Multilateral Relations", "Diplomacy",
                                "Natl. Perception and Public Opinion",
                                "Non-state Actors",
                                "Brazilian Foreign Policy")) %>%
  ggplot(aes(AnoBase, value)) +
  geom_smooth(se = TRUE, show.legend = TRUE, color = "black") + ## Smoothed trend with confidence band
  facet_wrap( ~ variable) + theme_bw() +
  labs(x = "Year", y = "Mean Prevalence") +
  theme(text = element_text(size = 10, family = "Cambria"),
        legend.margin = margin(0, 0, 0, 0),
        legend.box.margin = margin(0, 0, 0, 0))

ggsave(filename = "Figure7.png", height = 4.5, width = 7.5, units = "in")

## Figures 8 to 11 share one layout: a smoothed trend of prevalence over the
## years, one facet per topic. The helper draws the plot for the given topic
## labels, shows it on the active device and saves it to `file`.
save_topic_trends <- function(topics, file) {
  trend_plot <- prev %>%
    filter(variable %in% topics) %>%
    ggplot(aes(AnoBase, value)) +
    geom_smooth(se = TRUE, show.legend = TRUE, color = "black") +
    facet_wrap(~ variable) +
    theme_bw() +
    labs(x = "Year", y = "Mean Prevalence") +
    theme(
      text = element_text(size = 10, family = "Cambria"),
      legend.margin = margin(0, 0, 0, 0),
      legend.box.margin = margin(0, 0, 0, 0)
    )
  print(trend_plot)
  invisible(
    ggsave(filename = file, plot = trend_plot,
           height = 4.5, width = 7.5, units = "in")
  )
}

## Figure 8 - Trends in the average prevalence of topics linked to regional and major power studies

save_topic_trends(
  c("Southern Cone States", "European Studies",
    "Amazon and Borders", "Portuguese Africa",
    "US Foreign Policy",
    "Latin American Integration",
    "Southern Cone Integration",
    "Asian Powers", "South American Integration"),
  "Figure8.png"
)

## Figure 9 - Trends in the average prevalence of topics linked to the International Political Economy

save_topic_trends(
  c("Industry, Technology, and Innovation",
    "Agricultural Negotiations",
    "International Cooperation",
    "International Economy",
    "Multilateral Trade Regimes", "Energy Resources",
    "International Economic Crises",
    "Foreign Investment",
    "International Trade"),
  "Figure9.png"
)

## Figure 10 - Trends in the average prevalence of topics related to international security and defense

save_topic_trends(
  c("Strategic Studies", "Terrorism",
    "National Defense", "Peace Operations",
    "UN Security Council", "Humanitarian Intervention",
    "Nuclear Regime",
    "Intl. Security and Drug Trafficking", "MINUSTAH"),
  "Figure10.png"
)

## Figure 11 - Trends in the average prevalence of topics related to human rights, migration, environment, and post-positivism

save_topic_trends(
  c("Traditional Communities", "Gender Studies",
    "Inter-American HR System",
    "International Law", "Refugees",
    "Migration Flows", "Post-Positivism",
    "Environment and Climate Change"),
  "Figure11.png"
)

## We now prepare the data for the analysis regarding the geographical scope

df$TituloTese <- tolower(df$TituloTese) ## converting all characters to lower case in titles
df$PalavrasChave <- tolower(df$PalavrasChave) ## doing the same for keywords

## Search terms per region. A trailing "*" is a shell-style wildcard meaning
## "any word that starts with this stem"; terms without it are matched as
## plain substrings.
bras <- c("brasil*", "peb", "brasileir*") ## words related to "Brazil"
latam <- c("salvadorenh*", "nicaragu*", "latin", "guatemal*", "mexic*", "celac", "alalc", "aladi")
amsul <- c("sulameric*", "sul-americ*", "chil", "chile", "boliv*", "bolívia", "peru", "colomb*", "venez*", "guyan*", "suri", "unasul", "amazon*")
intera <- c("oea", "interam*", "panamer*", "inter-ameri*", "pan-americ*")
cone <- c("merco", "mercosul", "argentin*", "urugu*", "paragu*", "iguacu", "iguaçu", "montevide*")
ot <- c("china","ásia", "russia", "rússia", "japão", "estados unidos",
        "áfrica",  "euro*", "européia", "frança",
        "alemanha", "inglaterra")

## Turns a vector of search terms into one regular expression.
## The trailing "*" is removed instead of being handed to the regex engine:
## as a regex, "interam*" means "intera" followed by any number of "m" and
## therefore also matches unrelated words such as "interação". Substring
## matching on the bare stem gives the intended prefix search.
## NOTE(review): this intentionally changes which documents match compared
## with using the raw terms as regular expressions, so the counts behind
## Figures 12 and 13 may differ from earlier runs.
scope_pattern <- function(search_terms) {
  paste(sub("\\*$", "", search_terms), collapse = "|")
}

## Flags, for every row of `df`, whether any of `search_terms` occurs in the
## abstract, the title and the keywords. Returns a three-column data frame
## named <prefix>, <prefix>2 and <prefix>3 (abstract, title, keywords).
## Matching ignores case because the terms are lowercase while only titles
## and keywords are lower-cased above.
detect_scope <- function(search_terms, prefix) {
  pattern <- regex(scope_pattern(search_terms), ignore_case = TRUE)
  hits <- data.frame(
    str_detect(df$ResumoTese, pattern),
    str_detect(df$TituloTese, pattern),
    str_detect(df$PalavrasChave, pattern)
  )
  colnames(hits) <- paste0(prefix, c("", "2", "3"))
  hits
}

## One row per document, three logical columns per region
terms <- cbind(detect_scope(bras, "brasil"),
               detect_scope(latam, "latin"),
               detect_scope(amsul, "sul"),
               detect_scope(intera, "interam"),
               detect_scope(cone, "merco"),
               detect_scope(ot, "other"))

terms$scope <- "None/Other"

## Time to compare different scopes and attribute values according to the
## broadest scope: the regions are applied from narrowest to broadest, so a
## later (broader) match overwrites an earlier one. A missing text field
## (NA) counts as "no match".
scope_by_prefix <- c(brasil = "Brazil",
                     merco = "Southern Cone",
                     sul = "South America",
                     latin = "Latin America",
                     interam = "Inter-American")
for (prefix in names(scope_by_prefix)) {
  region_cols <- paste0(prefix, c("", "2", "3"))
  matched <- rowSums(terms[region_cols], na.rm = TRUE) > 0
  terms$scope[matched] <- scope_by_prefix[[prefix]]
}

## Number and share of documents per scope. The share uses the actual number
## of documents instead of a hard-coded total.
terms$num <- 1
term <- aggregate(num ~ scope, sum, data = terms) %>%
  mutate(prop = num / sum(num))

## Figure 12 - Geographic scope of the Brazilian IR theses and dissertations

## Horizontal bars with the share of documents per scope, ordered by the
## number of documents in each scope.
ggplot(term, aes(x = reorder(scope, num), y = prop)) +
  geom_col() +
  coord_flip() +
  labs(x = "Geographic scope", y = "Proportion of theses and dissertations") +
  theme_bw() +
  theme(
    text = element_text(size = 10, family = "Cambria"),
    legend.margin = margin(0, 0, 0, 0),
    legend.box.margin = margin(0, 0, 0, 0)
  )

ggsave(filename = "Figure12.png", height = 4.5, width = 6, units = "in")


## Figure 13 - The number of concluded theses and dissertations that mentioned neither Brazil nor its region

## All regional search terms in a single vector
breg <- c(bras, latam, amsul, intera, cone)

## Attach the scope flags to the STM output by row name and keep only the
## documents whose scope is "None/Other".
## NOTE(review): the join pairs row i of `df2` with row i of `terms`; this
## presumes both have the same documents in the same order - confirm.
terms$rn <- row.names(terms)
dfz <- mutate(df2, rn = row.names(df2)) %>%
  left_join(terms, by = "rn") %>%
  distinct() %>%
  filter(scope == "None/Other") %>%
  mutate(Total = 1)

## Documents per year: all documents (`total`) ...
dfzz <- aggregate(total ~ AnoBase, data = mutate(df, total = 1), FUN = sum)

## ... and "None/Other" documents (`Total`), plus their percentage of the
## yearly total, reshaped to long format with one row per year and indicator.
dfzag <- aggregate(Total ~ AnoBase, data = dfz, FUN = sum) %>%
  left_join(dfzz, by = "AnoBase") %>%
  distinct() %>%
  mutate(Proportion = (Total / total) * 100) %>%
  select(-total) %>%
  melt(id.vars = "AnoBase")

## Both indicators share the left axis (the proportion is drawn as a
## percentage); the right axis shows the same scale divided by 100.
ggplot(dfzag, aes(x = AnoBase, y = value,
                  linetype = variable, shape = variable)) +
  geom_line() +
  scale_y_continuous("Total of documents",
                     sec.axis = sec_axis(trans = ~./100,
                                         name = "Proportion of documents")) +
  labs(x = "Year", linetype = "Indicator", shape = "Indicator") +
  theme_bw() +
  theme(legend.position = "bottom") +
  theme(
    text = element_text(size = 10, family = "Cambria"),
    legend.margin = margin(0, 0, 0, 0),
    legend.box.margin = margin(0, 0, 0, 0)
  )

ggsave(filename = "Figure13.png", height = 4.5, width = 7.5, units = "in")

